import numpy as np
import pandas as pd
import dateutil.parser
import datetime
from datetime import timedelta
import math
import calendar
import statsmodels.api as sm
import statsmodels
import seaborn as sns
from numpy import linalg
import patsy
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn import feature_selection as f_select
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from pymongo import MongoClient
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt
import yellowbrick as yb
from yellowbrick.regressor.residuals import ResidualsPlot
%matplotlib inline
The data from our scraping resides in a Mongo database.
Let's go get it...
client = MongoClient()
db = client.willco
df = pd.DataFrame(
list(db.PropertyInfo.find())
)
Pulling from Mongo doesn't automatically convert missing values (the 'N/A' strings) to NaN the way read_csv does, so let's take care of that here.
df.replace('N/A',np.NaN,inplace=True)
df.shape
Pulling from Mongo orders the columns alphabetically (and adds Mongo's _id column). Let's reorder the columns to match how the data is presented on the website...
df = df[[
'PIN', 'PropClass', 'Address', 'City', 'Zip', 'SaleDate', 'SaleAmt',
'TaxRate', 'ASL', 'ASFL', 'AI', 'ASB', 'ASFB', 'ASTotal', 'ASFTotal',
'Subdivision', 'FullBath', 'Style', 'HalfBath', 'LivingSqFt', 'CentralAir',
'BldgSqFt', 'Fireplace', 'YearBuilt', 'Porch', 'Basement', 'Attic',
'Garage', 'Lot', 'Block', 'Unit', 'Building', 'Area', 'Status'
]]
df.head()
df.shape
We've scraped more data than we need. Let's pare it down...
df.PropClass.value_counts()
df = df[df['PropClass'] == 'RESIDENTIAL']
#df = df[df['SaleDate'].notnull()]
df.dropna(subset=['SaleDate'],inplace=True)
df.shape
The data is stored just as it appeared on the website, so we need a number of transformations before we can use it. The first eight characters of the PIN (e.g. '06-03-35') identify the section a parcel sits in; counting records per section gives us the Density feature used below.
df['Section'] = df['PIN'].apply(lambda x: x[:8])
gx=df.groupby('Section').count()['PIN']
gx[df.tail().Section]
gx['06-03-35']
def masseuse(data):
    ''' Initial field transformations to facilitate further analysis.
        Input: data - the dataframe
        Changes are made in place.
    '''
    # Section = first 8 characters of the PIN; Density = record count per section
    data['Section'] = data['PIN'].apply(lambda x: x[:8])
    gx = data.groupby('Section').count()['PIN']
    data['Density'] = data['Section'].apply(lambda x: gx[x]).astype(float)
    # Longevity = days between the sale date and 2018-01-01
    data['SaleDate'] = data['SaleDate'].apply(lambda x: dateutil.parser.parse(x))
    data['Longevity'] = data['SaleDate'].apply(lambda x: dateutil.parser.parse('2018-01-01') - x)
    data['Longevity'] = data['Longevity'].apply(lambda x: x.total_seconds() / 86400.)
    # Strip currency and formatting characters, then convert numeric fields to float
    data['SaleAmt'] = data['SaleAmt'].replace(r'[\$,]', '', regex=True).astype(float)
    data['TaxRate'] = data['TaxRate'].replace(r'\(\d+\)', '', regex=True).astype(float)
    data['ASTotal'] = data['ASTotal'].replace('[,]', '', regex=True).astype(float)
    data['ASFTotal'] = data['ASFTotal'].replace('[,]', '', regex=True).astype(float)
    data['LivingSqFt'] = data['LivingSqFt'].replace(' Sq. Feet', '', regex=True).astype(float)
    data['BldgSqFt'] = data['BldgSqFt'].replace(' Sq. Feet', '', regex=True).astype(float)
    data['YearBuilt'] = data['YearBuilt'].astype(float)
    data['FullBath'] = data['FullBath'].astype(float)
    data['HalfBath'] = data['HalfBath'].astype(float)
    # Encode the YES/NO flags as 1.0/0.0
    for flag in ['CentralAir', 'Fireplace', 'Porch']:
        data[flag].replace('YES', 1., inplace=True)
        data[flag].replace('NO', 0., inplace=True)
        data[flag] = data[flag].astype(float)
masseuse(df)
For the most part, a record from our data source either has complete information or has a block of fields marked unavailable. So when data is missing for one feature, it tends to be missing for the same set of features.
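One way to sanity-check that claim is to tally the distinct missing-value patterns across the columns we're about to keep (a minimal sketch; the column list just mirrors the feature set selected below):
# Hedged sketch: count how often each combination of missing fields occurs together.
check_cols = ['SaleAmt', 'TaxRate', 'ASTotal', 'LivingSqFt', 'YearBuilt',
              'FullBath', 'HalfBath', 'Fireplace', 'CentralAir', 'Porch']
null_patterns = df[check_cols].isnull().apply(tuple, axis=1).value_counts()
null_patterns.head(10)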
df2 = df[[
'Longevity',
'SaleAmt',
'TaxRate',
'ASTotal',
'LivingSqFt',
'YearBuilt',
'FullBath',
'HalfBath',
'Fireplace',
'CentralAir',
'Porch',
'Section',
'Density'
]]
df2=df2[df2.notnull().all(axis=1)]
df2.shape
Earlier exploration with smaller datasets showed that a number of our features have significant outliers. Switching to a log scale captured the data better, but some outliers remained, so we also trim points outside the 1.5 × IQR fences on the log scale.
The primary features to treat here are SaleAmt, ASTotal, LivingSqFt, and Longevity.
We could do the same for a few other fields, but several of them fall into groups that carry largely the same information. From each group I keep a single representative rather than carry the collinearity forward; for example, I use LivingSqFt and ignore BldgSqFt. A quick check of that overlap follows.
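A minimal check of the LivingSqFt/BldgSqFt overlap, assuming masseuse has already run so both square-footage columns are numeric:
# Hedged sketch: how correlated are the two square-footage fields?
pair = df[['LivingSqFt', 'BldgSqFt']].dropna()
pair.corr()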
df2 = df2[
(df2['SaleAmt'] != 0) &
(df2['LivingSqFt'] != 0) &
(df2['ASTotal'] != 0)
]
df2.shape
def explore_1(feature):
    ''' Plot a feature three ways: raw, log-transformed, and log-transformed
        with points outside the 1.5*IQR fences removed.
        Each row shows a boxplot (left) and a histogram (right).
    '''
    fig = plt.figure(figsize=(15, 4))
    # Row 0: raw feature
    ax = plt.subplot2grid((3, 2), (0, 0))
    bp = ax.boxplot(
        df2[feature],
        sym='+', vert=False, patch_artist=True)
    ax = plt.subplot2grid((3, 2), (0, 1))
    bp = ax.hist(
        df2[feature],
        bins=100
    )
    # Row 1: log of the feature
    log_feature = np.log(df2[feature])
    ax = plt.subplot2grid((3, 2), (1, 0))
    bp = ax.boxplot(
        log_feature,
        sym='+', vert=False, patch_artist=True)
    ax = plt.subplot2grid((3, 2), (1, 1))
    bp = ax.hist(
        log_feature,
        bins=100
    )
    # Row 2: log of the feature, trimmed to the 1.5*IQR fences
    q75, q25 = np.percentile(log_feature, [75, 25])
    iqr = q75 - q25
    lmin = q25 - (iqr * 1.5)
    lmax = q75 + (iqr * 1.5)
    log_trimmed = log_feature[(log_feature > lmin) & (log_feature < lmax)]
    ax = plt.subplot2grid((3, 2), (2, 0))
    bp = ax.boxplot(
        log_trimmed,
        sym='+', vert=False, patch_artist=True)
    ax = plt.subplot2grid((3, 2), (2, 1))
    bp = ax.hist(
        log_trimmed,
        bins=100
    )
explore_1('Longevity')
explore_1('SaleAmt')
explore_1('ASTotal')
explore_1('LivingSqFt')
We could let patsy do the log transformation inside the formula, but let's do it here so we can also apply IQR-based outlier removal.
for feature in ['SaleAmt', 'ASTotal', 'LivingSqFt', 'Longevity']:
    logf = 'Log_' + feature
    df2[logf] = np.log(df2[feature])
    q75, q25 = np.percentile(df2[logf], [75, 25])
    iqr = q75 - q25
    lmin = q25 - (iqr * 1.5)
    lmax = q75 + (iqr * 1.5)
    df2 = df2[df2[logf] > lmin]
    df2 = df2[df2[logf] < lmax]
df2.shape
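For reference, patsy can apply the transform inside the formula itself, since formulas can call numpy functions from the calling namespace. A minimal sketch of that alternative (not used below; the response and predictors mirror the formula we fit later):
# Hedged sketch: the same log transform expressed directly in the patsy formula.
y_alt, X_alt = patsy.dmatrices(
    'np.log(Longevity) ~ np.log(SaleAmt) + np.log(ASTotal) + np.log(LivingSqFt) + TaxRate',
    data=df2, return_type='dataframe'
)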
# A quick cut here for prototyping...
df2 = df2[:1000]
df2.corr().sort_values('Longevity')
y, X = patsy.dmatrices('Log_Longevity ~ Log_SaleAmt + TaxRate + Log_ASTotal + Log_LivingSqFt + YearBuilt + Fireplace + CentralAir + Porch', data=df2, return_type="dataframe")
X.head()
# Keep X as a DataFrame after scaling so the .iloc/.columns access further down still works
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns, index=X.index)
y.mean()
reg = LinearRegression()
scores = cross_val_score(reg, X, y, cv=10, scoring='neg_mean_squared_error')
# scores are negative: sklearn reports neg_mean_squared_error so that larger is always better for its optimizers
print(np.array([math.sqrt(x) for x in -scores]))
reg = LinearRegression()
degree=2
est = make_pipeline(PolynomialFeatures(degree), LinearRegression())
scores = cross_val_score(est, X, y, cv=10, scoring='neg_mean_squared_error')
# scores are negative: sklearn reports neg_mean_squared_error so that larger is always better for its optimizers
print(np.array([math.sqrt(x) for x in -scores]))
reg = LinearRegression()
scoring = ['neg_mean_squared_error', 'r2']
scores = cross_validate(reg, X, y, scoring=scoring,cv=10, return_train_score=True)
scores
kf = KFold(n_splits=5, shuffle=True)
# create a linear regression model
lm = LinearRegression()
all_scores = []
# get indices of corresponding train & test
for train, test in kf.split(X):
    x_train = X.iloc[train]
    y_train = y.iloc[train]
    x_test = X.iloc[test]
    y_test = y.iloc[test]
    pvals = []
    sig_cols = []
    # within each training set, find the significant variables
    for feature in x_train.columns:
        pval = f_select.f_regression(x_train[[feature]], y_train)
        if pval[1][0] < .01:
            sig_cols.append(feature)
            pvals.append(pval[1][0])
    lm.fit(x_train[sig_cols], y_train)
    print(sig_cols)
    r_2 = lm.score(x_test[sig_cols], y_test)
    all_scores.append(r_2)
np.mean(all_scores)
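The per-fold univariate selection above can also be expressed as a pipeline step, so cross-validation refits the selection on every training split automatically. A sketch using sklearn's SelectFpr with the same p < .01 cutoff (not the code used above):
from sklearn.feature_selection import SelectFpr, f_regression

# Hedged sketch: keep only features whose f_regression p-value is below 0.01,
# refit on each training split by cross_val_score.
sel_est = make_pipeline(SelectFpr(f_regression, alpha=0.01), LinearRegression())
print(cross_val_score(sel_est, X, y.values.ravel(), cv=5, scoring='r2').mean())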
alpha = 0.01
degree=2
est = make_pipeline(PolynomialFeatures(degree), Ridge(alpha=alpha))
scoring = ['neg_mean_squared_error', 'r2','neg_median_absolute_error']
scores = cross_validate(est, X, y, scoring=scoring,cv=10, return_train_score=True)
scores
scores['test_neg_median_absolute_error'].mean()/365
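Since the target is Log_Longevity, this error is in log-days; a hedged alternative reading is as a multiplicative factor on the number of days:
# Hedged sketch: convert the median absolute error in log-days to a multiplicative factor.
factor = np.exp(-scores['test_neg_median_absolute_error'].mean())
factor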
alpha = 100
degree=2
est = make_pipeline(PolynomialFeatures(degree), Lasso(alpha=alpha))
scoring = ['neg_mean_squared_error', 'r2']
scores = cross_validate(est, X, y, scoring=scoring,cv=10, return_train_score=True)
scores
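alpha = 100 is a fairly heavy penalty on standardized features. If we'd rather let cross-validation choose alpha, LassoCV sweeps a grid of candidates; a sketch, where the grid and max_iter values are assumptions:
from sklearn.linear_model import LassoCV

# Hedged sketch: choose alpha by cross-validation over a log-spaced grid.
lasso_cv = make_pipeline(
    PolynomialFeatures(2),
    LassoCV(alphas=np.logspace(-4, 2, 20), cv=5, max_iter=10000)
)
lasso_cv.fit(X, y.values.ravel())
lasso_cv.named_steps['lassocv'].alpha_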
First, my own attempt at building a learning-curve plot, before using sklearn's learning_curve utility...
def sample_test_train(N, k, size, seed=False):
    ''' Create a set of train/test samples.
        The goal here is random sets of a certain size.
        There is no intention of these sets being non-overlapping.
        Size is the same for both test and train unless the requested
        size is more than half of the N provided.
        Input:
            N     the size of the data
            k     the number of samples desired
            size  the size of the samples desired
            seed  optional random seed for reproducibility
        Output:
            List of pairs of lists of indices
    '''
    if seed:
        np.random.seed(seed)
    output = []
    for i in range(k):
        indices = np.random.permutation(N)
        output.append([indices[:size], indices[size:min(size * 2, N)]])
    return output
def learning_plot(est, X, y, points=10, rep=10):
    """ Create a plot to visualize the learning of the model.
        Inputs:
            est     the estimator
            X       the features
            y       the target
            points  the number of training-set sizes to test
            rep     the number of repetitions at each size
    """
    x = np.arange(points)
    train_err = np.array([])
    test_err = np.array([])
    N = len(X)
    # Training-set sizes ranging up to 90% of the data
    sizes = [math.ceil(0.9 * N * (1 + i) / points) for i in x]
    for size in sizes:
        train = np.array([])
        test = np.array([])
        for train_ind, test_ind in sample_test_train(N, rep, size):
            X_train = X.iloc[train_ind]
            X_test = X.iloc[test_ind]
            y_train = y.iloc[train_ind]
            y_test = y.iloc[test_ind]
            est.fit(X_train, y_train)
            train = np.append(train,
                              metrics.mean_squared_error(y_train, est.predict(X_train)))
            test = np.append(test,
                             metrics.mean_squared_error(y_test, est.predict(X_test)))
        train_err = np.append(train_err, train.mean())
        test_err = np.append(test_err, test.mean())
    plt.figure(figsize=(8, 6))
    plt.scatter(sizes, train_err, c='b', label='train')
    plt.scatter(sizes, test_err, c='r', label='test')
    plt.legend(loc='best')
est = make_pipeline(LinearRegression())
learning_plot(est,X,y,20,100)
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y-values plotted.
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds,
          - an object to be used as a cross-validation generator,
          - an iterable yielding train/test splits.
        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` is used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.
        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.4,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.4, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
est = make_pipeline(LinearRegression())
plot_learning_curve(
est,
"Learning Curves (Linear Regression)",
X[:1000], y[:1000],
cv = ShuffleSplit(n_splits=100, test_size=0.2),
n_jobs=4,
#ylim=(0.7, 1.01)
)
est = make_pipeline(LinearRegression())
plot_learning_curve(
est,
"Learning Curves (Linear Regression)",
X, y,
cv = ShuffleSplit(n_splits=100, test_size=0.2),
n_jobs=4,
#ylim=(0.7, 1.01)
)
alpha = 0.01
degree=2
est = make_pipeline(PolynomialFeatures(degree), Ridge(alpha=alpha))
plot_learning_curve(
est,
"Learning Curves (Ridge Regression)",
X, y,
cv = ShuffleSplit(n_splits=100, test_size=0.2),
n_jobs=4,
#ylim=(0.7, 1.01)
)
est = make_pipeline(LinearRegression())
# Create the train and test data
X_train, X_test, y_train, y_test = train_test_split(X[:1000], y[:1000], test_size=0.2)
visualizer = ResidualsPlot(est)
visualizer.fit(X_train, y_train) # Fit the training data to the visualizer
visualizer.score(X_test, y_test) # Evaluate the model on the test data
visualizer.ax.get_yaxis().set_ticks([])
g = visualizer.poof() # Draw/show/poof the data
alpha = 0.01
degree=2
est = make_pipeline(PolynomialFeatures(degree), Ridge(alpha=alpha))
# Create the train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
visualizer = ResidualsPlot(est)
visualizer.fit(X_train, y_train) # Fit the training data to the visualizer
visualizer.score(X_test, y_test) # Evaluate the model on the test data
g = visualizer.poof() # Draw/show/poof the data